package data_deepprocessing.algorithm.ssvm.ssvm_util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import data_deepprocessing.prepareData.beans.Node2vec_WordBookBean;
import data_deepprocessing.prepareData.beans.XianBingShi_new_zhangBean;
import data_deepprocessing.prepareData.db.InitSeedDB;
import data_deepprocessing.prepareData.db.Node2vec_WordBookDB;
import data_deepprocessing.prepareData.db.XianBingShi_New_zhangDB;
import data_deepprocessing.util.FilePathUtil;
import data_deepprocessing.util.FileUtil;

/** 
* @author YUHU YUAN
* @date created 2017-04-10 20:06:30
* @version 1.0  
* Differs from the word2vec variant only in that it additionally reads a vector file
* (node id -> embedding) and resolves ids back to words through the word book.
*/

public class CRFplusCrossValidService2Node2vec {
	
	private InitSeedDB initSeedDB;
	private XianBingShi_New_zhangDB xianBingShi_New_zhangDB;
	private Node2vec_WordBookDB node2vec_WordBookDB;
	
	/** Number of cross-validation folds. */
	private static final int FOLD_COUNT = 10;
	
	/** Minimum character length of a segmented record for it to be used. */
	private static final int MIN_RECORD_LENGTH = 20;
	
	/** Dimension of the placeholder vector emitted when a word has no embedding. */
	private static final int VECTOR_DIMENSION = 200;
	
	/** Maps the numeric node id from the word book to the word/phrase itself. */
	private static Map<Integer, String> Num_WordMap = new HashMap<>();
	
	/** Maps a word/phrase to its space-separated node2vec vector string. */
	private static Map<String, String> Word_VectorMap = new HashMap<>();
	
	/** Output root directory for the generated cross-validation data sets. */
	private static String path = "D:\\yyh_yuanyuhu_graduation_experimental\\node2vec_CRF++\\dataset1\\";
	
	/** The ten folds, keyed 0..9; filled by {@link #divideDataset()}. */
	private Map<Integer, List<XianBingShi_new_zhangBean>> divided_dataset = initMap();
	
	/** Loads every word/phrase entry of the node2vec word book from the database. */
	private List<Node2vec_WordBookBean> doGetWordBookBeans(){
		return node2vec_WordBookDB.selectAllWordBook2Phrase();
	}
	
	/**
	 * Builds the word-to-vector lookup table: first maps node ids to words from
	 * the word book, then reads the node2vec vector file (format: {@code <id> <vector...>})
	 * and keys each vector by the resolved word.
	 *
	 * @throws IOException if the vector file cannot be read
	 */
	private void doBuildWord_VectorMap() throws IOException{
		for(Node2vec_WordBookBean bean : doGetWordBookBeans()){
			Num_WordMap.put(bean.getNum(), bean.getWord());
		}
		
		// FIX: try-with-resources — the original leaked the reader.
		try(BufferedReader reader = FileUtil.getReader(FilePathUtil.node2vec_vectorPath)){
			reader.readLine(); // skip the first line: it is a header, not vector data
			String line;
			while((line = reader.readLine()) != null){
				String[] parts = line.split(" ", 2);
				if(parts.length < 2){
					continue; // malformed line without a vector part
				}
				String word = Num_WordMap.get(Integer.valueOf(parts[0]));
				if(word != null){
					Word_VectorMap.put(word, parts[1]);
				}
			}
		}
	}
	
	/** Loads all segmented "present illness history" records from the database. */
	private List<XianBingShi_new_zhangBean> doGetSegmentedXianBingShi(){
		return xianBingShi_New_zhangDB.selectALlXianBingShi();
	}
	
	/** Loads the symptom seed dictionary used for labelling tokens. */
	private List<String> doGetSymptom_Dictionary(){
		return initSeedDB.selectAllSeedContent();
	}
	
	/**
	 * Randomly distributes every record into one of the ten folds: repeatedly
	 * removes a random element and assigns it to fold {@code remainingSize % 10},
	 * so the folds end up (nearly) equally sized with random membership.
	 */
	private void divideDataset(){
		List<XianBingShi_new_zhangBean> beanList = doGetSegmentedXianBingShi();
		while (!beanList.isEmpty()) {
			int size = beanList.size();
			int index = (int) (Math.random() * size);
			// ArrayList.remove(int) returns the removed element
			divided_dataset.get(size % FOLD_COUNT).add(beanList.remove(index));
		}
	}
	
	/** Creates an empty fold list for each of the ten fold keys 0..9. */
	private Map<Integer, List<XianBingShi_new_zhangBean>> initMap(){
		Map<Integer, List<XianBingShi_new_zhangBean>> folds = new HashMap<>();
		for (int i = 0; i < FOLD_COUNT; i++) {
			folds.put(i, new ArrayList<XianBingShi_new_zhangBean>());
		}
		return folds;
	}
	
	/**
	 * Writes one CRF++-style data file: one token per line as
	 * {@code <tag>\t1:<v1>\t2:<v2>...}, with a blank line between records
	 * (the CRF++ sequence boundary). Tag 1 = symptom-dictionary term, 2 = other.
	 *
	 * @param filePath            destination file path
	 * @param xianbingshiBeanList records to serialize
	 * @throws IOException if the file cannot be written
	 */
	private void doGenerateCRFDataSet(String filePath,List<XianBingShi_new_zhangBean> xianbingshiBeanList) throws IOException{
		// NOTE(review): a HashSet would make contains() O(1) instead of O(n)
		List<String> symptom_dictionary = doGetSymptom_Dictionary();
		StringBuilder constructTerm = new StringBuilder();
		// FIX: try-with-resources — the original never closed the writer on an
		// exception path; close() also flushes, so the per-line flush() is gone.
		try(BufferedWriter writer = FileUtil.getWriter(filePath)){
			for(XianBingShi_new_zhangBean bean : xianbingshiBeanList){
				String segmentedXBS = bean.getContent_segmented();
				if(segmentedXBS.length() < MIN_RECORD_LENGTH){
					continue; // record too short to be a useful training sequence
				}
				String[] wordOrPhrases = segmentedXBS.trim().split("[\\s]+");
				for(String word : wordOrPhrases){
					// FIX: the original used && here, so blank tokens were never
					// skipped and a null token would have thrown an NPE.
					if(word == null || word.trim().isEmpty()){
						continue;
					}
					String tag = (symptom_dictionary.contains(word) ? 1 : 2) + "";
					constructTerm.append(tag).append("\t");
					// FIX: the original called split() on the map value before the
					// null check, so an unknown word caused an NPE and the
					// placeholder-vector branch below was unreachable.
					String vectorString = Word_VectorMap.get(word);
					if(vectorString != null){
						String[] vector = vectorString.split(" ");
						for(int i = 0; i < vector.length; ++i){
							constructTerm.append((i + 1) + ":" + vector[i]).append("\t");
						}
					}else{
						// no embedding for this word: emit a constant placeholder
						for(int i = 0; i < VECTOR_DIMENSION; ++i){
							constructTerm.append((i + 1) + ":" + 1).append("\t");
						}
					}
					writer.write(constructTerm.toString());
					writer.newLine();
					constructTerm.setLength(0);
				}
				writer.newLine(); // blank line separates records for CRF++
			}
		}
	}
	
	/**
	 * Entry point: builds the word-vector lookup, splits all records into ten
	 * folds, and for every fold i writes fold i as the test file and the union
	 * of the other nine folds as the training file.
	 */
	public void doGenerateCRFCrossValidDataSet(){
		try {
			doBuildWord_VectorMap();
		} catch (IOException e) {
			e.printStackTrace();
		}
		divideDataset();
		
		for(int i = 0; i < divided_dataset.size(); ++i){
			// fold i is the held-out test set
			String testPath = path + "data" + i + File.separator + "test"
					+ File.separator + "test.crfsuite.txt";
			try {
				doGenerateCRFDataSet(testPath, divided_dataset.get(i));
			} catch (IOException e) {
				e.printStackTrace();
			}
			// the remaining nine folds form the training set
			String trainPath = path + "data" + i + File.separator + "train"
					+ File.separator + "train.crfsuite.dat";
			List<XianBingShi_new_zhangBean> trainDataSet = new ArrayList<>();
			for(int j = 0; j < divided_dataset.size(); ++j){
				if(j != i){
					trainDataSet.addAll(divided_dataset.get(j));
				}
			}
			try {
				doGenerateCRFDataSet(trainPath, trainDataSet);
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
	
	public InitSeedDB getInitSeedDB() {
		return initSeedDB;
	}

	public void setInitSeedDB(InitSeedDB initSeedDB) {
		this.initSeedDB = initSeedDB;
	}

	public XianBingShi_New_zhangDB getXianBingShi_New_zhangDB() {
		return xianBingShi_New_zhangDB;
	}

	public void setXianBingShi_New_zhangDB(XianBingShi_New_zhangDB xianBingShi_New_zhangDB) {
		this.xianBingShi_New_zhangDB = xianBingShi_New_zhangDB;
	}

	public Node2vec_WordBookDB getNode2vec_WordBookDB() {
		return node2vec_WordBookDB;
	}

	public void setNode2vec_WordBookDB(Node2vec_WordBookDB node2vec_WordBookDB) {
		this.node2vec_WordBookDB = node2vec_WordBookDB;
	}
	

}








